library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggridges)
library(hexbin)

Load the weather data

# The rnoaa package gives access to public data aggregated by NOAA, i.e. the
# records are pulled straight from the online source.
# The three IDs below (USW00094728, ...) identify weather stations.
weather_df =
  rnoaa::meteo_pull_monitors(
    monitors = c("USW00094728", "USC00519397", "USS0023B17S"),
    date_min = "2017-01-01",
    date_max = "2017-12-31",
    var = c("PRCP", "TMIN", "TMAX")
  ) %>%
  # Attach a readable station label derived from the station id.
  mutate(
    name = recode(
      id,
      USW00094728 = "CentralPark_NY",
      USC00519397 = "Waikiki_HA",
      USS0023B17S = "Waterhole_WA"
    )
  ) %>%
  # Rescale both temperature columns by a factor of ten (presumably tenths of
  # a degree -> degrees; TODO confirm against the NOAA data documentation).
  mutate(
    tmin = tmin / 10,
    tmax = tmax / 10
  ) %>%
  # Put the label and the id first, keep every other column after them.
  select(name, id, everything())
## Registered S3 method overwritten by 'hoardr':
##   method           from
##   print.cache_info httr
## using cached file: /Users/yiming/Library/Caches/R/noaa_ghcnd/USW00094728.dly
## date created (size, mb): 2020-10-07 16:20:31 (7.525)
## file min/max dates: 1869-01-01 / 2020-10-31
## using cached file: /Users/yiming/Library/Caches/R/noaa_ghcnd/USC00519397.dly
## date created (size, mb): 2020-10-07 16:20:49 (1.699)
## file min/max dates: 1965-01-01 / 2020-03-31
## using cached file: /Users/yiming/Library/Caches/R/noaa_ghcnd/USS0023B17S.dly
## date created (size, mb): 2020-10-07 16:21:03 (0.88)
## file min/max dates: 1999-09-01 / 2020-10-31
# Show the first rows of the assembled data set.
print(weather_df)
## # A tibble: 1,095 x 6
##    name           id          date        prcp  tmax  tmin
##    <chr>          <chr>       <date>     <dbl> <dbl> <dbl>
##  1 CentralPark_NY USW00094728 2017-01-01     0   8.9   4.4
##  2 CentralPark_NY USW00094728 2017-01-02    53   5     2.8
##  3 CentralPark_NY USW00094728 2017-01-03   147   6.1   3.9
##  4 CentralPark_NY USW00094728 2017-01-04     0  11.1   1.1
##  5 CentralPark_NY USW00094728 2017-01-05     0   1.1  -2.7
##  6 CentralPark_NY USW00094728 2017-01-06    13   0.6  -3.8
##  7 CentralPark_NY USW00094728 2017-01-07    81  -3.2  -6.6
##  8 CentralPark_NY USW00094728 2017-01-08     0  -3.8  -8.8
##  9 CentralPark_NY USW00094728 2017-01-09     0  -4.9  -9.9
## 10 CentralPark_NY USW00094728 2017-01-10     0   7.8  -6  
## # … with 1,085 more rows

Scatterplots

Create my first scatterplot ever.

# Scatterplot with tmin on the x axis and tmax on the y axis.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

New approach, same plot.

# Piping the data into ggplot() is the better approach: steps such as mutate()
# can be slotted in between the dataset and the plot without having to build
# a separate data frame first.
weather_df %>%
  ggplot(mapping = aes(x = tmin, y = tmax)) +
  geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

Save and edit a plot object.

# Keep the layer-less base plot in an object so layers can be added later.
weather_plot = ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax))

# Add the point layer to the stored plot and display the result.
weather_plot +
  geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

Advanced scatterplot…

Start with the same one and make it fancy!

# geom_smooth() adds a smooth trajectory through the data. Because
# color = name is set in the ggplot() call it applies to every layer, so
# several curves are drawn -- one per value of `name` (in this example).
weather_df %>%
  ggplot(mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

What about the aes placement?

# The points look the same as in the previous plot, but here the color
# mapping belongs to geom_point() only. The smoother therefore sees no
# grouping and draws a single curve instead of one per `name`.
weather_df %>%
  ggplot(mapping = aes(x = tmin, y = tmax)) +
  geom_point(mapping = aes(color = name)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

Let’s facet some things

# facet_grid() builds a multi-panel plot from a `rows ~ columns` formula:
# the left-hand side defines the rows, the right-hand side the columns, and
# "." means "no variable on this side".
# alpha = 0.5 makes the points 50% transparent; alpha can also be mapped to a
# variable (e.g. alpha = tmin) to get a gradient that follows that variable.
# size = 0.3 shrinks the points.
weather_df %>%
  ggplot(mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_point(size = 0.3, alpha = 0.5) +
  geom_smooth(se = FALSE) +
  facet_grid(. ~ name)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

Let’s combine some elements and try a new plot

# tmax over the year, one panel per station; the point size follows the
# precipitation column (prcp).
weather_df %>%
  ggplot(mapping = aes(x = date, y = tmax, color = name)) +
  geom_point(mapping = aes(size = prcp), alpha = 0.5) +
  geom_smooth(se = FALSE) +
  facet_grid(. ~ name)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

Some small notes

How many geoms have to exist?

You can have whatever geoms you want.

# Smooth curves can be drawn on their own, without a scatterplot underneath.
weather_df %>%
  ggplot(mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).

You can use a neat geom!

# geom_hex() bins points that fall in the same area into hexagons and shades
# each hexagon by how many points it contains.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_hex()
## Warning: Removed 15 rows containing non-finite values (stat_binhex).

# 2D density plot -- it reads much like the contour lines on a map -- with the
# raw points drawn on top at 30% opacity.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_density_2d() +
  geom_point(alpha = 0.3)
## Warning: Removed 15 rows containing non-finite values (stat_density2d).
## Warning: Removed 15 rows containing missing values (geom_point).

Univariate plots: one variable at a time

Histograms are really great.

# Histogram of tmin across all stations, using the default binning.
ggplot(data = weather_df, mapping = aes(x = tmin)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).

Can we add color…

# position = "dodge" places the bars for the different names side by side
# instead of stacking them on top of each other.
# `fill` plays much the same role here as `color` does in a scatterplot;
# histograms are colored through `fill`.
weather_df %>%
  ggplot(mapping = aes(x = tmin, fill = name)) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).

# Same histogram, but with one panel per station instead of dodged bars.
weather_df %>%
  ggplot(mapping = aes(x = tmin, fill = name)) +
  geom_histogram() +
  facet_grid(. ~ name)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).

Let’s try a new geometry!

# geom_density() is a histogram that has been smoothed out around the edges.
# The smoothing hides some of the fine detail in the data; `adjust` rescales
# the amount of smoothing, so a value below 1 keeps more of that detail.
weather_df %>%
  ggplot(mapping = aes(x = tmin, fill = name)) +
  geom_density(adjust = 0.5, alpha = 0.3)
## Warning: Removed 15 rows containing non-finite values (stat_density).

What about box plots?

# One box of tmin per station.
ggplot(data = weather_df, mapping = aes(x = name, y = tmin)) +
  geom_boxplot()
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).

Trendy plots

# A violin plot is roughly a cross between a box plot and a density plot.
# stat_summary() overlays a summary statistic computed on the fly. Its default
# geom is a pointrange, which also expects ymin/ymax; only `fun` is supplied
# here, so the median is drawn as a plain point. That avoids the
# "Removed 3 rows containing missing values (geom_segment)" warning caused by
# the missing range values.
weather_df %>% 
  ggplot(aes(x = name, y = tmin, fill = name)) +
  geom_violin(alpha = .5) +
  stat_summary(fun = "median", geom = "point")
## Warning: Removed 15 rows containing non-finite values (stat_ydensity).
## Warning: Removed 15 rows containing non-finite values (stat_summary).

Ridge plots – the most popular plot of 2017

# A ridge plot is a kind of density plot: instead of overlaying the densities
# for the levels of a categorical variable in one panel, each level gets its
# own ridge so the distributions can be read separately.
ggplot(data = weather_df, mapping = aes(x = tmin, y = name)) +
  geom_density_ridges()
## Picking joint bandwidth of 1.67
## Warning: Removed 15 rows containing non-finite values (stat_density_ridges).

Save and Embed

Let’s save a scatterplot.

# Unless a location is given, ggsave() writes relative to the current working
# directory. Here the file goes into a `result` sub-folder of that directory.
weather_plot1 =
  weather_df %>% 
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = .5)

# Make sure the target folder exists before writing, so the save does not
# fail on a fresh checkout; this is a no-op when the folder is already there.
dir.create("./result", showWarnings = FALSE, recursive = TRUE)

# Pass the plot by name so it is explicit which object is being written.
ggsave("./result/weather_plot1.pdf", plot = weather_plot1, width = 8, height = 5)
## Warning: Removed 15 rows containing missing values (geom_point).

What about embedding…

# Render the stored scatterplot inline in the document.
print(weather_plot1)
## Warning: Removed 15 rows containing missing values (geom_point).

Embed at different size

# Render the same plot again.
# NOTE(review): the size change presumably comes from chunk options that are
# not visible in this rendered output -- confirm in the source document.
print(weather_plot1)
## Warning: Removed 15 rows containing missing values (geom_point).